#!/usr/bin/env python
# coding: utf-8

# # Reading data sets
# - IC50
# - Genomic Features
# - Drug Decoder

# In[1]:


# NOTE: this script is a notebook export and must run inside IPython.
# The pylab magic is what brings `matplotlib` and pyplot names such as
# `title` into the namespace; nothing else in this file imports them.
get_ipython().run_line_magic('pylab', 'inline')
matplotlib.rcParams['figure.figsize'] = (10, 6)


# In[2]:


# let us import some functions
from gdsctools import IC50, DrugDecode, GenomicFeatures
# and data sets
from gdsctools import ic50_test, genomic_features_test
from gdsctools.datasets import testing


# ## IC50

# The first type of data set to be used in the analysis is the matrix of
# IC50. There is a test file called **ic50_test** that gives the location
# of such a file.

# In[3]:


ic50 = IC50(ic50_test)


# In[4]:


print(ic50)


# In[5]:


data = ic50.plot_ic50_count(marker='o')
title("Count of valid IC50 values per drug")


# In[6]:


data = ic50.hist()


# In[7]:


# Drop a handful of drugs from the IC50 data, then redraw the histogram.
drug_to_drop = [
    'Drug_999_IC50',
    'Drug_1047_IC50',
    'Drug_1049_IC50',
    'Drug_1050_IC50',
    'Drug_1052_IC50',
    'Drug_1053_IC50',
]
dummy = ic50.drop_drugs(drug_to_drop)
data = ic50.hist()


# ## Genomic Features

# In[8]:


f = GenomicFeatures()  # default from the package


# This is equivalent to

# In[9]:


f = GenomicFeatures(genomic_features_test)


# In[10]:


print(f)


# Note that this GenomicFeatures matrix must have 3 special columns
# to provide the sample name, Tissue Factor Value and MSI factor value.
# Then all features.

# In[11]:


f.df.iloc[0:3]


# In[12]:


df = f.plot()


# In[13]:


# Tissues whose group in `f.df` has fewer rows than this are removed below.
MIN_SAMPLES_PER_TISSUE = 40

groups = f.df.groupby('TISSUE_FACTOR').groups
to_remove = [
    tissue
    for tissue, members in groups.items()
    if len(members) < MIN_SAMPLES_PER_TISSUE
]


# In[14]:


info = f.drop_tissue_in(to_remove)
f.plot()


# ## Drug Decoder

# GDSCTools provides an IC50 test file (ic50_test). The drug identifiers are
# usually encoded with a unique identifier that has no meaning. A decoder
# file may be provided. For example, we provide the drug_test data set.

# In[15]:


print(testing.drug_test_csv)


# In[16]:


dd = DrugDecode(testing.drug_test_csv)
print(dd)


# It can be used to retrieve the name and target of the drug

# In[17]:


dd.get_name('Drug_1047_IC50')


# In[18]:


dd.get_target('Drug_1047_IC50')


# In[ ]: